In [5]:
# Redirect the Hugging Face cache to scratch storage. This must happen
# BEFORE any HF-backed library (sentence_transformers below) is imported,
# because HF_HOME is read at import time.
import os
os.environ['HF_HOME'] = '/scratch/fd2264/cache'
from scipy.cluster import hierarchy as sch
import sys
# Make the project's src/ directory importable from this notebook.
sys.path.insert(1, '../src/')
from utils import clean_text,Text_Topic_Extractor,set_topic_explainer_pipe,sentence_split_nltk,get_word_count, compute_and_save_embeddings, set_finetuned_topic_explainer
from visutils import plot_topic_across_time
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
# One-time setup for the Spanish spaCy model (run in a shell if missing):
#python -m spacy download es_core_news_sm
/ext3/miniconda3/lib/python3.12/site-packages/dask/dataframe/__init__.py:31: FutureWarning: Dask dataframe query planning is disabled because dask-expr is not installed. You can install it with `pip install dask[dataframe]` or `conda install dask`. This will raise in a future version. warnings.warn(msg, FutureWarning)
In [11]:
# Auto-reload edited project modules (utils.py / visutils.py) on every cell
# execution, so changes in ../src/ are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
Further cleaning and splitting the data into sentences¶
In [6]:
# Minimum number of tokens a sentence must have to be kept as a "document"
# for topic modelling; shorter fragments carry too little topical signal.
MIN_WORD_COUNT = 15

interventions = pd.read_json('../data/clean/interventions.json', orient='records')
# Bracket assignment instead of attribute assignment: `df.col = ...` can
# silently set an attribute rather than a column and pandas warns against it.
interventions['speech'] = interventions['speech'].apply(clean_text)
interventions['Document'] = interventions['speech'].apply(sentence_split_nltk)

# One row per sentence; drop the speech-level counts, which no longer apply
# at sentence granularity, and recompute the word count per sentence.
phrase_df = interventions.explode('Document')
phrase_df = phrase_df.drop(["speech", "word_count", "sentence_count"], axis=1)
phrase_df['word_count'] = phrase_df.Document.apply(get_word_count)
# Keep only sentences longer than the threshold; reset_index() (without
# drop=True) intentionally preserves the original row index as a column.
phrase_df = phrase_df[phrase_df['word_count'] > MIN_WORD_COUNT].reset_index()
speeches = phrase_df.Document
Computing the embeddings if needed¶
In [7]:
# Toggle: True loads precomputed artifacts from disk; False recomputes them
# (slow — encodes every sentence with a multilingual SentenceTransformer).
load = True
approach = "sentence_NLTK"

if not load:
    embedding_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
    compute_and_save_embeddings(embedding_model, approach)
else:
    # np.load accepts a path directly, so the manual `with open(...)`
    # file-handle management is unnecessary.
    data_dir = f'../data/approach_{approach}'
    reduced_embeddings = np.load(f'{data_dir}/5d_embeddings.npy')     # 5-D UMAP projection
    reduced_embeddings_2d = np.load(f'{data_dir}/2d_embeddings.npy')  # 2-D projection for plotting
    clusters = np.load(f'{data_dir}/clusters.npy')                    # cluster label per sentence
    embeddings = np.load(f'{data_dir}/embeddings.npy')                # full-dimensional embeddings
Defining the topic extractor and loading¶
In [8]:
# Build the topic extractor over the filtered sentences, pointing it at the
# precomputed artifacts on disk rather than recomputing them.
# NOTE(review): this hardcodes the same approach folder loaded above —
# presumably it should stay in sync with `approach`; confirm.
load_folder = '../data/approach_sentence_NLTK/'
topic_extractor = Text_Topic_Extractor(text_list = speeches,
clusters = load_folder + 'clusters.npy' ,
embeddings = load_folder + 'embeddings.npy',
reduced_embeddings = load_folder + '5d_embeddings.npy',
reduced_embeddings_2d = load_folder + '2d_embeddings.npy',
min_topic_size = 10 )
If you don't have a serialized topic_extractor, you have to fit it on your documents first.
In [9]:
# To (re)train from scratch instead of loading, uncomment the three lines below:
# topic_explainer_pipe, tokenizer = set_topic_explainer_pipe()
# topic_extractor.fit(topic_explainer_pipe = topic_explainer_pipe )
# topic_extractor.save(folder_path = 'saved_models/per_sentence/')

# Load the previously fitted topic model and its topic-info table from disk.
model_dir = 'saved_models/per_sentence/'
topic_extractor.load(topic_model_path=model_dir, topic_info_path=model_dir + 'topic_info.pkl')
In [10]:
# Post-process the generated topic labels (fast — progress bar below shows ~254 topics).
topic_extractor.clean_labels()
100%|██████████| 254/254 [00:00<00:00, 309156.48it/s]
Visualizations¶
Visualizing cluster on the embedding space¶
Using visualize_2d_clusters we can see how each topic is distributed in the embedding space. To avoid an overly heavy plot, use hide_document_hover = True. If you set hide_document_hover = False, it will show the content of each document when you hover over it.
In [10]:
# Interactive 2-D scatter of the top 10 topics; document hover and per-point
# annotations are disabled to keep the plot lightweight.
topic_extractor.visualize_2d_clusters(hide_document_hover = True, custom_labels = True, top_n = 10,hide_annotations = True)